started: AL26Apr2019
last updated: AL17Sep2019

Summary

Input sequencing data: 1,838 vars x 712 samples ( 515BC = 258UBC + 257CBC and 197NFE)
Input eigenvectors for 712 samples (calculated using 215 non-rare variants not in LD)

Add eigenvectors to phenotypes, make PCA plots and
Mark 26 PC outliers: outside of mean +/- 3*sd on the top 2 eigenvectors

Start_section

Sys.time()
## [1] "2019-09-17 17:55:51 BST"
rm(list=ls())
graphics.off()

library(knitr)
## Warning: package 'knitr' was built under R version 3.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.2
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s03_explore_PCA_plots_exclude_outliers"

opts_knit$set(root.dir = base_folder)

options(stringsAsFactors = F)
options(warnPartialMatchArgs = T, 
        warnPartialMatchAttr = T, 
        warnPartialMatchDollar = T)

#options(error = browser()) # Type Q or c to exit, drop browser level
# https://support.rstudio.com/hc/en-us/articles/200713843?version=1.1.456&mode=desktop
# https://stackoverflow.com/questions/13052522/how-to-leave-the-r-browser-mode-in-the-console-window/13052588 

Read_data

# Threshold for outlier detection
th <- 3 # mean +/- (th*sd)

# Sequencing data (for wecare phenotypes: case/control status)
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s11_remove_BRCA_PALB_carriers"
load(paste(source_folder, "s03_exclude_BRCA1_BCRA2_PALB2_carriers.RData", sep="/"))
base_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s03_explore_PCA_plots_exclude_outliers"

# Eigenvectors & eigenvalues
source_folder="/Users/alexey/Documents/wecare/ampliseq/v04_ampliseq_nfe/s13_ampliseq-nfe_only_PCA/s02_calculate_ampliseq_nfe_only_PCs/data/s04_pca"

eigenvectors_file <- paste(source_folder, "ampliseq_nfe_215_712_100PCs.eigenvec", sep="/")
eigenvalues_file <- paste(source_folder, "ampliseq_nfe_215_712_100PCs.eigenval", sep="/")

eigenvectors.df <- read.table(eigenvectors_file, header=T, sep="\t",quote="")
eigenvalues.df <- read.table(eigenvalues_file, header=F, sep="\t",quote="")

# Clean-up
rm(source_folder, eigenvectors_file, eigenvalues_file)

Check data

# List of objects
ls()
## [1] "base_folder"     "eigenvalues.df"  "eigenvectors.df" "genotypes.mx"    "phenotypes.df"   "th"              "variants.df"
# Dimentions of objects
dim(eigenvectors.df)
## [1] 712 102
dim(eigenvalues.df)
## [1] 100   1
dim(genotypes.mx)
## [1] 1838  712
dim(phenotypes.df)
## [1] 712  24
dim(variants.df)
## [1] 1838   65
# Counts of nfe/controls/cases
table(phenotypes.df$cc)
## 
##  -1   0   1 
## 197 258 257
# Update eigenvectors
rownames(eigenvectors.df) <- eigenvectors.df$FID
eigenvectors.df <- eigenvectors.df[,-1]
"long_ids" -> colnames(eigenvectors.df)[1]
eigenvectors.df[1:5,1:5]
##                    long_ids        PC1         PC2         PC3         PC4
## 100_S8_L007     100_S8_L007 -0.0247118  0.00577574 -0.01270700  0.00726611
## 101_S528_L008 101_S528_L008 -0.0154747 -0.00145424 -0.00755583 -0.03030170
## 102_S466_L008 102_S466_L008  0.0169168  0.07628740  0.00574448  0.01463160
## 103_S147_L007 103_S147_L007  0.1431820  0.03840790 -0.03832170 -0.04580630
## 104_S325_L008 104_S325_L008  0.0129566 -0.02575130 -0.07195110 -0.02135050

Plot eigenvalues

plot(eigenvalues.df$V1, type="b", 
     xlab="Eigenvector", ylab="Eigenvalue",
     ylim=c(0,20),main="Top 100 eigenvectors")

plot(eigenvalues.df$V1[1:20], type="b", 
     xlab="Eigenvector", ylab="Eigenvalue",
     ylim=c(0,20), main="Top 20 eigenvectors")

rm(eigenvalues.df)

Prepare data for plots

# Add factor column with verbal lables for phenotypes to be used in plot
group <- factor(phenotypes.df$cc, levels = c(0,1,-1), labels = c("UBC","CBC","NFE"))
phenotypes.df <- data.frame(phenotypes.df,group)
table(phenotypes.df$group)
## 
## UBC CBC NFE 
## 258 257 197
# Add column with sample IDs for the plot
sample_id <- phenotypes.df$Sample_num
sample_id[phenotypes.df$group=="NFE"] <- phenotypes.df$long_ids[phenotypes.df$group=="NFE"]
phenotypes.df <- data.frame(phenotypes.df,sample_id)
head(phenotypes.df$sample_id)
## [1] "100" "101" "102" "103" "104" "105"
tail(phenotypes.df$sample_id)
## [1] "NA20819" "NA20821" "NA20822" "NA20826" "NA20828" "NA20832"
# Add 5 top eigenvectors to phenotypes
phenotypes.df <- full_join(phenotypes.df,eigenvectors.df[,1:6], by="long_ids")
rownames(phenotypes.df) <- phenotypes.df$long_ids

# Prepare colour scale for ggplots
myColours <- c("NFE"="green", "UBC"="blue", "CBC"="red")
myColourScale <- scale_colour_manual(values=myColours)

# Clean-up
rm(group, sample_id, eigenvectors.df, myColours)

Calculate thresholds for outliers

pc1_mean <- mean(phenotypes.df$PC1)
pc1_sd <- sd(phenotypes.df$PC1)
pc1_min <- pc1_mean - pc1_sd * th
pc1_max <- pc1_mean + pc1_sd * th

pc2_mean <- mean(phenotypes.df$PC2)
pc2_sd <- sd(phenotypes.df$PC2)
pc2_min <- pc2_mean - pc2_sd * th
pc2_max <- pc2_mean + pc2_sd * th

pc3_mean <- mean(phenotypes.df$PC3)
pc3_sd <- sd(phenotypes.df$PC3)
pc3_min <- pc3_mean - pc3_sd * th
pc3_max <- pc3_mean + pc3_sd * th

pc4_mean <- mean(phenotypes.df$PC4)
pc4_sd <- sd(phenotypes.df$PC4)
pc4_min <- pc4_mean - pc4_sd * th
pc4_max <- pc4_mean + pc4_sd * th

pc5_mean <- mean(phenotypes.df$PC5)
pc5_sd <- sd(phenotypes.df$PC5)
pc5_min <- pc5_mean - pc5_sd * th
pc5_max <- pc5_mean + pc5_sd * th

rm(pc1_mean, pc1_sd, 
   pc2_mean, pc2_sd, 
   pc3_mean, pc3_sd, 
   pc4_mean, pc4_sd, 
   pc5_mean, pc5_sd,
   th)

Mark outliers

# Detect outliers on each PC
pc1_outlier <- phenotypes.df$PC1 < pc1_min | phenotypes.df$PC1 > pc1_max
pc2_outlier <- phenotypes.df$PC2 < pc2_min | phenotypes.df$PC2 > pc2_max
pc3_outlier <- phenotypes.df$PC3 < pc3_min | phenotypes.df$PC3 > pc3_max
pc4_outlier <- phenotypes.df$PC4 < pc4_min | phenotypes.df$PC4 > pc4_max
pc5_outlier <- phenotypes.df$PC5 < pc5_min | phenotypes.df$PC5 > pc5_max

# Detect outliers on any PC
pc_outlier <- pc1_outlier | pc2_outlier

#pc_outlier <- pc1_outlier | pc2_outlier | pc3_outlier | pc4_outlier | pc5_outlier

# Check counts of outliers
sum(pc1_outlier)
## [1] 18
sum(pc2_outlier)
## [1] 9
sum(pc3_outlier)
## [1] 3
sum(pc4_outlier)
## [1] 7
sum(pc5_outlier)
## [1] 3
sum(pc_outlier)
## [1] 26
# Add outliers data to phenotypes dataframe
phenotypes.df <- data.frame(phenotypes.df, pc_outlier, pc1_outlier, pc2_outlier,
                            pc3_outlier, pc4_outlier, pc5_outlier)

phenotypes.df %>%
  filter(pc_outlier) %>% 
  select(sample_id, pc_outlier, pc1_outlier, pc2_outlier,
         pc3_outlier, pc4_outlier, pc5_outlier) %>% 
  arrange(as.integer(sample_id))
##    sample_id pc_outlier pc1_outlier pc2_outlier pc3_outlier pc4_outlier pc5_outlier
## 1          3       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 2         16       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 3        103       TRUE        TRUE       FALSE       FALSE       FALSE        TRUE
## 4        141       TRUE        TRUE        TRUE       FALSE       FALSE       FALSE
## 5        148       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 6        235       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 7        238       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 8        246       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 9        256       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 10       267       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 11       273       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 12       275       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 13       277       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 14       287       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 15       293       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 16       308       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 17       326       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 18       329       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 19       347       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 20       366       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 21       368       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 22       369       TRUE       FALSE        TRUE       FALSE       FALSE       FALSE
## 23       385       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 24       388       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 25       403       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
## 26       408       TRUE        TRUE       FALSE       FALSE       FALSE       FALSE
# Clean-up
rm(pc_outlier, pc1_outlier, pc2_outlier, pc3_outlier, pc4_outlier, pc5_outlier)

PC1 vs PC2

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC1,PC2)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC1 vs PC2", x="PC1", y="PC2") +
  theme(legend.title=element_blank()) + # To suppress the legend title
  myColourScale +                       # otherwise it would be "group"
  geom_vline(xintercept=c(pc1_min, pc1_max), linetype="dotted", size=0.2) +
  geom_hline(yintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)

PC2 vs PC3

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC2,PC3)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC2 vs PC3", x="PC2", y="PC3") +
  theme(legend.title=element_blank()) + 
  myColourScale + 
  geom_vline(xintercept=c(pc2_min, pc2_max), linetype="dotted", size=0.2)
## Warning: Ignoring unknown aesthetics: text
# geom_hline(yintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2)

# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)

PC3 vs PC4

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC3,PC4)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC3 vs PC4", x="PC3", y="PC4") +
  theme(legend.title=element_blank()) + 
  myColourScale 
## Warning: Ignoring unknown aesthetics: text
#  geom_vline(xintercept=c(pc3_min, pc3_max), linetype="dotted", size=0.2) +
#  geom_hline(yintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2)

# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g)

PC4 vs PC5

# Prepare ggplot
g <- ggplot(phenotypes.df, aes(PC4,PC5)) + 
  geom_point(aes(col=group, text=sample_id)) +
  labs(title="PC4 vs PC5", x="PC4", y="PC5") +
  theme(legend.title=element_blank()) + 
  myColourScale
## Warning: Ignoring unknown aesthetics: text
#  geom_vline(xintercept=c(pc4_min, pc4_max), linetype="dotted", size=0.2) +
#  geom_hline(yintercept=c(pc5_min, pc5_max), linetype="dotted", size=0.2)

# Draw static ggplot
#g

# Draw dynamic ggplotly
ggplotly(g, tooltip="text") # By default the tooltip would also show coordinates 
## Warning in dev_fun(file = tempfile(), width = width %||% 640, height = height %||% : partial argument match of 'file' to 'filename'
# Clean-up
rm(g, myColourScale, 
   pc1_min, pc1_max, pc2_min, pc2_max, pc3_min, pc3_max, pc4_min, pc4_max, pc5_min, pc5_max)

Save results

save.image(paste(base_folder, "s01_add_PCs.RData", sep="/"))

Final_section

ls()
## [1] "base_folder"   "genotypes.mx"  "phenotypes.df" "variants.df"
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] plotly_4.9.0  ggplot2_3.2.0 dplyr_0.8.1   knitr_1.23   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.1        later_0.8.0       pillar_1.4.1      compiler_3.5.1    tools_3.5.1       digest_0.6.19     jsonlite_1.6      evaluate_0.14     tibble_2.1.3      gtable_0.3.0      viridisLite_0.3.0 pkgconfig_2.0.2   rlang_0.3.4       shiny_1.3.2       crosstalk_1.0.0   yaml_2.2.0        xfun_0.7          withr_2.1.2       stringr_1.4.0     httr_1.4.0        htmlwidgets_1.3   grid_3.5.1        tidyselect_0.2.5  glue_1.3.1        data.table_1.12.2 R6_2.4.0          rmarkdown_1.13    purrr_0.3.2       tidyr_0.8.3       magrittr_1.5      promises_1.0.1    scales_1.0.0      htmltools_0.3.6   assertthat_0.2.1  xtable_1.8-4      mime_0.7          colorspace_1.4-1  httpuv_1.5.1      labeling_0.3      stringi_1.4.3     lazyeval_0.2.2    munsell_0.5.0     crayon_1.3.4
Sys.time()
## [1] "2019-09-17 17:55:54 BST"